#Import all packages needed
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#Read CSV file
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
twitter_archive.head()
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
| 1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
| 2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
| 3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
| 4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
url="https://video.udacity-data.com/topher/2018/November/5bf60c69_image-predictions-3/image-predictions-3.tsv"
response = requests.get(url)
with open('image_prediction.tsv', 'wb') as file:
file.write(response.content)
image_prediction = pd.read_csv('image_prediction.tsv', sep='\t')
image_prediction.head()
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
| 1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
| 2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
| 3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
| 4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
api_key = 'Zqkjh7AJq6hLnL5X6UHfzhBES'
api_key_secret = 'aEqG0URRpopHmE2msx40gBWkqJvGjeV3w6BveAi0gVVCRFHnDi'
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAKWQgQEAAAAAx7ngtZv7r12IB8IfFgibsKHWedo%3DowzJieJ0c022GguMLQTFpJ8p7vSTtxIMiwz6pPJHTO5R9esrrz'
access_token = '206365523-aFjx2vsSDJdDcRsC12gVyWenQHdWOKjWECgA1BDy'
access_token_secret = 'zab4zuB0NP6rx94DWu2Fe108DonaJE3hparfqGA9uTTYq'
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token = (access_token, access_token_secret)
api = tweepy.API(auth,
parser = tweepy.parsers.JSONParser())
full_status = []
not_found_tweets = []
for tweet_id in twitter_archive['tweet_id']:
try:
full_status.append(api.get_status(tweet_id))
except Exception as e:
not_found_tweets.append(tweet_id)
full_status[0:2]
[{'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
'id': 892420643555336193,
'id_str': '892420643555336193',
'text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU",
'truncated': False,
'entities': {'hashtags': [],
'symbols': [],
'user_mentions': [],
'urls': [],
'media': [{'id': 892420639486877696,
'id_str': '892420639486877696',
'indices': [86, 109],
'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
'url': 'https://t.co/MgUWQ76dJU',
'display_url': 'pic.twitter.com/MgUWQ76dJU',
'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
'type': 'photo',
'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
'medium': {'w': 540, 'h': 528, 'resize': 'fit'},
'small': {'w': 540, 'h': 528, 'resize': 'fit'},
'large': {'w': 540, 'h': 528, 'resize': 'fit'}}}]},
'extended_entities': {'media': [{'id': 892420639486877696,
'id_str': '892420639486877696',
'indices': [86, 109],
'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
'url': 'https://t.co/MgUWQ76dJU',
'display_url': 'pic.twitter.com/MgUWQ76dJU',
'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
'type': 'photo',
'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
'medium': {'w': 540, 'h': 528, 'resize': 'fit'},
'small': {'w': 540, 'h': 528, 'resize': 'fit'},
'large': {'w': 540, 'h': 528, 'resize': 'fit'}}}]},
'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': None,
'in_reply_to_user_id_str': None,
'in_reply_to_screen_name': None,
'user': {'id': 4196983835,
'id_str': '4196983835',
'name': 'WeRateDogs®',
'screen_name': 'dog_rates',
'location': 'all our links ➜',
'description': 'Your Only Source For Professional Dog Ratings Instagram and Facebook ➜ WeRateDogs partnerships@weratedogs.com | nonprofit: @15outof10 ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀',
'url': 'https://t.co/YPc2Xq4Va2',
'entities': {'url': {'urls': [{'url': 'https://t.co/YPc2Xq4Va2',
'expanded_url': 'http://links.weratedogs.com',
'display_url': 'links.weratedogs.com',
'indices': [0, 23]}]},
'description': {'urls': []}},
'protected': False,
'followers_count': 9357103,
'friends_count': 21,
'listed_count': 7614,
'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
'favourites_count': 147566,
'utc_offset': None,
'time_zone': None,
'geo_enabled': True,
'verified': True,
'statuses_count': 16172,
'lang': None,
'contributors_enabled': False,
'is_translator': False,
'is_translation_enabled': False,
'profile_background_color': '000000',
'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
'profile_background_tile': False,
'profile_image_url': 'http://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1661991479',
'profile_link_color': 'F5ABB5',
'profile_sidebar_border_color': '000000',
'profile_sidebar_fill_color': '000000',
'profile_text_color': '000000',
'profile_use_background_image': False,
'has_extended_profile': False,
'default_profile': False,
'default_profile_image': False,
'following': None,
'follow_request_sent': None,
'notifications': None,
'translator_type': 'none',
'withheld_in_countries': []},
'geo': None,
'coordinates': None,
'place': None,
'contributors': None,
'is_quote_status': False,
'retweet_count': 6973,
'favorite_count': 33703,
'favorited': False,
'retweeted': False,
'possibly_sensitive': False,
'possibly_sensitive_appealable': False,
'lang': 'en'},
{'created_at': 'Tue Aug 01 00:17:27 +0000 2017',
'id': 892177421306343426,
'id_str': '892177421306343426',
'text': "This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boop… https://t.co/aQFSeaCu9L",
'truncated': True,
'entities': {'hashtags': [],
'symbols': [],
'user_mentions': [],
'urls': [{'url': 'https://t.co/aQFSeaCu9L',
'expanded_url': 'https://twitter.com/i/web/status/892177421306343426',
'display_url': 'twitter.com/i/web/status/8…',
'indices': [117, 140]}]},
'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
'in_reply_to_status_id': None,
'in_reply_to_status_id_str': None,
'in_reply_to_user_id': None,
'in_reply_to_user_id_str': None,
'in_reply_to_screen_name': None,
'user': {'id': 4196983835,
'id_str': '4196983835',
'name': 'WeRateDogs®',
'screen_name': 'dog_rates',
'location': 'all our links ➜',
'description': 'Your Only Source For Professional Dog Ratings Instagram and Facebook ➜ WeRateDogs partnerships@weratedogs.com | nonprofit: @15outof10 ⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀',
'url': 'https://t.co/YPc2Xq4Va2',
'entities': {'url': {'urls': [{'url': 'https://t.co/YPc2Xq4Va2',
'expanded_url': 'http://links.weratedogs.com',
'display_url': 'links.weratedogs.com',
'indices': [0, 23]}]},
'description': {'urls': []}},
'protected': False,
'followers_count': 9357103,
'friends_count': 21,
'listed_count': 7614,
'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
'favourites_count': 147566,
'utc_offset': None,
'time_zone': None,
'geo_enabled': True,
'verified': True,
'statuses_count': 16172,
'lang': None,
'contributors_enabled': False,
'is_translator': False,
'is_translation_enabled': False,
'profile_background_color': '000000',
'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
'profile_background_tile': False,
'profile_image_url': 'http://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1552995729014247425/TaJbIdmK_normal.jpg',
'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1661991479',
'profile_link_color': 'F5ABB5',
'profile_sidebar_border_color': '000000',
'profile_sidebar_fill_color': '000000',
'profile_text_color': '000000',
'profile_use_background_image': False,
'has_extended_profile': False,
'default_profile': False,
'default_profile_image': False,
'following': None,
'follow_request_sent': None,
'notifications': None,
'translator_type': 'none',
'withheld_in_countries': []},
'geo': None,
'coordinates': None,
'place': None,
'contributors': None,
'is_quote_status': False,
'retweet_count': 5276,
'favorite_count': 29229,
'favorited': False,
'retweeted': False,
'possibly_sensitive': False,
'possibly_sensitive_appealable': False,
'lang': 'en'}]
len(full_status)
1615
with open('tweet_json.txt', 'w') as file:
file.write(json.dumps(full_status, indent = 2))
full_json = pd.DataFrame(full_status)
full_json.head(1)
| created_at | id | id_str | text | truncated | entities | extended_entities | source | in_reply_to_status_id | in_reply_to_status_id_str | ... | favorite_count | favorited | retweeted | possibly_sensitive | possibly_sensitive_appealable | lang | retweeted_status | quoted_status_id | quoted_status_id_str | quoted_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Tue Aug 01 16:23:56 +0000 2017 | 892420643555336193 | 892420643555336193 | This is Phineas. He's a mystical boy. Only eve... | False | {'hashtags': [], 'symbols': [], 'user_mentions... | {'media': [{'id': 892420639486877696, 'id_str'... | <a href="http://twitter.com/download/iphone" r... | NaN | None | ... | 33703 | False | False | False | False | en | NaN | NaN | NaN | NaN |
1 rows × 30 columns
full_json.columns
Index(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities',
'extended_entities', 'source', 'in_reply_to_status_id',
'in_reply_to_status_id_str', 'in_reply_to_user_id',
'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
'coordinates', 'place', 'contributors', 'is_quote_status',
'retweet_count', 'favorite_count', 'favorited', 'retweeted',
'possibly_sensitive', 'possibly_sensitive_appealable', 'lang',
'retweeted_status', 'quoted_status_id', 'quoted_status_id_str',
'quoted_status'],
dtype='object')
json_tweets = full_json[['id', 'retweet_count','favorite_count']]
json_tweets
| id | retweet_count | favorite_count | |
|---|---|---|---|
| 0 | 892420643555336193 | 6973 | 33703 |
| 1 | 892177421306343426 | 5276 | 29229 |
| 2 | 891815181378084864 | 3465 | 21976 |
| 3 | 891689557279858688 | 7193 | 36799 |
| 4 | 891327558926688256 | 7719 | 35186 |
| ... | ... | ... | ... |
| 1610 | 666049248165822465 | 36 | 88 |
| 1611 | 666044226329800704 | 115 | 246 |
| 1612 | 666033412701032449 | 36 | 100 |
| 1613 | 666029285002620928 | 39 | 112 |
| 1614 | 666020888022790149 | 419 | 2284 |
1615 rows × 3 columns
json_tweets.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1615 entries, 0 to 1614 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 1615 non-null int64 1 retweet_count 1615 non-null int64 2 favorite_count 1615 non-null int64 dtypes: int64(3) memory usage: 38.0 KB
json_tweets.to_csv('json_tweets.csv', header = True, index = False)
#Visual assessment of twitter archive data
twitter_archive
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
| 1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
| 2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
| 3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
| 4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2351 | 666049248165822465 | NaN | NaN | 2015-11-16 00:24:50 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a 1949 1st generation vulpix. Enj... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666049248... | 5 | 10 | None | None | None | None | None |
| 2352 | 666044226329800704 | NaN | NaN | 2015-11-16 00:04:52 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a purebred Piers Morgan. Loves to Netf... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666044226... | 6 | 10 | a | None | None | None | None |
| 2353 | 666033412701032449 | NaN | NaN | 2015-11-15 23:21:54 +0000 | <a href="http://twitter.com/download/iphone" r... | Here is a very happy pup. Big fan of well-main... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666033412... | 9 | 10 | a | None | None | None | None |
| 2354 | 666029285002620928 | NaN | NaN | 2015-11-15 23:05:30 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a western brown Mitsubishi terrier. Up... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666029285... | 7 | 10 | a | None | None | None | None |
| 2355 | 666020888022790149 | NaN | NaN | 2015-11-15 22:32:08 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a Japanese Irish Setter. Lost eye... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666020888... | 8 | 10 | None | None | None | None | None |
2356 rows × 17 columns
#Visual assessment of twitter image prediction
image_prediction
| tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
| 1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
| 2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
| 3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
| 4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2070 | 891327558926688256 | https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg | 2 | basset | 0.555712 | True | English_springer | 0.225770 | True | German_short-haired_pointer | 0.175219 | True |
| 2071 | 891689557279858688 | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1 | paper_towel | 0.170278 | False | Labrador_retriever | 0.168086 | True | spatula | 0.040836 | False |
| 2072 | 891815181378084864 | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1 | Chihuahua | 0.716012 | True | malamute | 0.078253 | True | kelpie | 0.031379 | True |
| 2073 | 892177421306343426 | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | papillon | 0.068957 | True |
| 2074 | 892420643555336193 | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | orange | 0.097049 | False | bagel | 0.085851 | False | banana | 0.076110 | False |
2075 rows × 12 columns
#Visual assessment of twitter json tweets
json_tweets
| id | retweet_count | favorite_count | |
|---|---|---|---|
| 0 | 892420643555336193 | 6973 | 33703 |
| 1 | 892177421306343426 | 5276 | 29229 |
| 2 | 891815181378084864 | 3465 | 21976 |
| 3 | 891689557279858688 | 7193 | 36799 |
| 4 | 891327558926688256 | 7719 | 35186 |
| ... | ... | ... | ... |
| 1610 | 666049248165822465 | 36 | 88 |
| 1611 | 666044226329800704 | 115 | 246 |
| 1612 | 666033412701032449 | 36 | 100 |
| 1613 | 666029285002620928 | 39 | 112 |
| 1614 | 666020888022790149 | 419 | 2284 |
1615 rows × 3 columns
#Getting Twitter Archive columns datatypes and null values
twitter_archive.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2356 non-null int64 1 in_reply_to_status_id 78 non-null float64 2 in_reply_to_user_id 78 non-null float64 3 timestamp 2356 non-null object 4 source 2356 non-null object 5 text 2356 non-null object 6 retweeted_status_id 181 non-null float64 7 retweeted_status_user_id 181 non-null float64 8 retweeted_status_timestamp 181 non-null object 9 expanded_urls 2297 non-null object 10 rating_numerator 2356 non-null int64 11 rating_denominator 2356 non-null int64 12 name 2356 non-null object 13 doggo 2356 non-null object 14 floofer 2356 non-null object 15 pupper 2356 non-null object 16 puppo 2356 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 313.0+ KB
#Checking if there's duplicated rows in the table
twitter_archive.duplicated().sum()
0
#Checking duplicated values in tweet_id column
twitter_archive.tweet_id.duplicated().sum()
0
#Checking dogs names values for consistency
twitter_archive.name.value_counts()
None 745
a 55
Charlie 12
Cooper 11
Lucy 11
...
Dex 1
Ace 1
Tayzie 1
Grizzie 1
Christoper 1
Name: name, Length: 957, dtype: int64
twitter_archive.name.sort_values().unique()
array(['Abby', 'Ace', 'Acro', 'Adele', 'Aiden', 'Aja', 'Akumi', 'Al',
'Albert', 'Albus', 'Aldrick', 'Alejandro', 'Alexander',
'Alexanderson', 'Alf', 'Alfie', 'Alfy', 'Alice', 'Amber',
'Ambrose', 'Amy', 'Amélie', 'Anakin', 'Andru', 'Andy', 'Angel',
'Anna', 'Anthony', 'Antony', 'Apollo', 'Aqua', 'Archie', 'Arlen',
'Arlo', 'Arnie', 'Arnold', 'Arya', 'Ash', 'Asher', 'Ashleigh',
'Aspen', 'Astrid', 'Atlas', 'Atticus', 'Aubie', 'Augie', 'Autumn',
'Ava', 'Axel', 'Bailey', 'Baloo', 'Balto', 'Banditt', 'Banjo',
'Barclay', 'Barney', 'Baron', 'Barry', 'Batdog', 'Bauer', 'Baxter',
'Bayley', 'BeBe', 'Bear', 'Beau', 'Beckham', 'Beebop', 'Beemo',
'Bell', 'Bella', 'Belle', 'Ben', 'Benedict', 'Benji', 'Benny',
'Bentley', 'Berb', 'Berkeley', 'Bernie', 'Bert', 'Bertson',
'Betty', 'Beya', 'Biden', 'Bilbo', 'Billl', 'Billy', 'Binky',
'Birf', 'Bisquick', 'Blakely', 'Blanket', 'Blipson', 'Blitz',
'Bloo', 'Bloop', 'Blu', 'Blue', 'Bluebert', 'Bo', 'Bob', 'Bobb',
'Bobbay', 'Bobble', 'Bobby', 'Bode', 'Bodie', 'Bonaparte', 'Bones',
'Bookstore', 'Boomer', 'Boots', 'Boston', 'Bowie', 'Brad',
'Bradlay', 'Bradley', 'Brady', 'Brandi', 'Brandonald', 'Brandy',
'Brat', 'Brian', 'Brockly', 'Brody', 'Bronte', 'Brooks', 'Brownie',
'Bruce', 'Brudge', 'Bruiser', 'Bruno', 'Brutus', 'Bubba',
'Bubbles', 'Buckley', 'Buddah', 'Buddy', 'Bungalo', 'Burt',
'Butter', 'Butters', 'Cal', 'Calbert', 'Cali', 'Callie', 'Calvin',
'Canela', 'Cannon', 'Carbon', 'Carl', 'Carll', 'Carly', 'Carper',
'Carter', 'Caryl', 'Cash', 'Cassie', 'CeCe', 'Cecil', 'Cedrick',
'Cermet', 'Chadrick', 'Champ', 'Charl', 'Charles', 'Charleson',
'Charlie', 'Chase', 'Chaz', 'Cheesy', 'Chef', 'Chelsea', 'Cheryl',
'Chesney', 'Chester', 'Chesterson', 'Chet', 'Chevy', 'Chip',
'Chipson', 'Chloe', 'Chompsky', 'Christoper', 'Chubbs', 'Chuck',
'Chuckles', 'Chuq', 'Churlie', 'Cilantro', 'Clarence', 'Clark',
'Clarkus', 'Clarq', 'Claude', 'Cleopatricia', 'Clifford', 'Clybe',
'Clyde', 'Coco', 'Cody', 'Colby', 'Coleman', 'Colin', 'Combo',
'Comet', 'Cooper', 'Coops', 'Coopson', 'Cora', 'Corey', 'Covach',
'Craig', 'Crawford', 'Creg', 'Crimson', 'Crouton', 'Crumpet',
'Crystal', 'Cuddles', 'Cupcake', 'Cupid', 'Curtis', 'Daisy',
'Dakota', 'Dale', 'Dallas', 'Damon', 'Daniel', 'Danny', 'Dante',
'Darby', 'Darla', 'Darrel', 'Dash', 'Dave', 'Davey', 'Dawn',
'DayZ', 'Deacon', 'Derby', 'Derek', 'Devón', 'Dewey', 'Dex',
'Dexter', 'Dido', 'Dietrich', 'Diogi', 'Divine', 'Dixie', 'Django',
'Dobby', 'Doc', 'DonDon', 'Donny', 'Doobert', 'Dook', 'Dot',
'Dotsy', 'Doug', 'Duchess', 'Duddles', 'Dudley', 'Dug', 'Duke',
'Dunkin', 'Durg', 'Dutch', 'Dwight', 'Dylan', 'Earl', 'Eazy',
'Ebby', 'Ed', 'Edd', 'Edgar', 'Edmund', 'Eevee', 'Einstein',
'Eleanor', 'Eli', 'Ellie', 'Elliot', 'Emanuel', 'Ember', 'Emma',
'Emmie', 'Emmy', 'Enchilada', 'Erik', 'Eriq', 'Ester', 'Eugene',
'Eve', 'Evy', 'Fabio', 'Farfle', 'Ferg', 'Fido', 'Fiji', 'Fillup',
'Filup', 'Finley', 'Finn', 'Finnegus', 'Fiona', 'Fizz', 'Flash',
'Fletcher', 'Florence', 'Flurpson', 'Flávio', 'Frank', 'Frankie',
'Franklin', 'Franq', 'Fred', 'Freddery', 'Frönq', 'Furzey', 'Fwed',
'Fynn', 'Gabby', 'Gabe', 'Gary', 'General', 'Genevieve', 'Geno',
'Geoff', 'George', 'Georgie', 'Gerald', 'Gerbald', 'Gert',
'Gidget', 'Gilbert', 'Gin', 'Ginger', 'Gizmo', 'Glacier', 'Glenn',
'Godi', 'Godzilla', 'Goliath', 'Goose', 'Gordon', 'Grady', 'Grey',
'Griffin', 'Griswold', 'Grizz', 'Grizzie', 'Grizzwald', 'Gromit',
'Gunner', 'Gus', 'Gustaf', 'Gustav', 'Gòrdón', 'Hall', 'Halo',
'Hammond', 'Hamrick', 'Hank', 'Hanz', 'Happy', 'Harlso', 'Harnold',
'Harold', 'Harper', 'Harrison', 'Harry', 'Harvey', 'Hazel',
'Hector', 'Heinrich', 'Henry', 'Herald', 'Herb', 'Hercules',
'Herm', 'Hermione', 'Hero', 'Herschel', 'Hobbes', 'Holly',
'Horace', 'Howie', 'Hubertson', 'Huck', 'Humphrey', 'Hunter',
'Hurley', 'Huxley', 'Iggy', 'Ike', 'Indie', 'Iroh', 'Ito', 'Ivar',
'Izzy', 'JD', 'Jack', 'Jackie', 'Jackson', 'Jameson', 'Jamesy',
'Jangle', 'Jareld', 'Jarod', 'Jarvis', 'Jaspers', 'Jax', 'Jay',
'Jaycob', 'Jazz', 'Jazzy', 'Jeb', 'Jebberson', 'Jed', 'Jeffrey',
'Jeffri', 'Jeffrie', 'Jennifur', 'Jeph', 'Jeremy', 'Jerome',
'Jerry', 'Jersey', 'Jesse', 'Jessifer', 'Jessiga', 'Jett', 'Jim',
'Jimbo', 'Jiminus', 'Jiminy', 'Jimison', 'Jimothy', 'Jo',
'Jockson', 'Joey', 'Jomathan', 'Jonah', 'Jordy', 'Josep', 'Joshwa',
'Juckson', 'Julio', 'Julius', 'Juno', 'Kaia', 'Kaiya', 'Kallie',
'Kane', 'Kanu', 'Kara', 'Karl', 'Karll', 'Karma', 'Kathmandu',
'Katie', 'Kawhi', 'Kayla', 'Keet', 'Keith', 'Kellogg', 'Ken',
'Kendall', 'Kenneth', 'Kenny', 'Kenzie', 'Keurig', 'Kevin',
'Kevon', 'Kial', 'Kilo', 'Kingsley', 'Kirby', 'Kirk', 'Klein',
'Klevin', 'Kloey', 'Kobe', 'Koda', 'Kody', 'Koko', 'Kollin',
'Kona', 'Kota', 'Kramer', 'Kreg', 'Kreggory', 'Kulet', 'Kuyu',
'Kyle', 'Kyro', 'Lacy', 'Laela', 'Laika', 'Lambeau', 'Lance',
'Larry', 'Lassie', 'Layla', 'Leela', 'Lennon', 'Lenny', 'Lenox',
'Leo', 'Leonard', 'Leonidas', 'Levi', 'Liam', 'Lilah', 'Lili',
'Lilli', 'Lillie', 'Lilly', 'Lily', 'Lincoln', 'Linda', 'Link',
'Linus', 'Lipton', 'Livvie', 'Lizzie', 'Logan', 'Loki', 'Lola',
'Lolo', 'Longfellow', 'Loomis', 'Lorelei', 'Lorenzo', 'Lou',
'Louie', 'Louis', 'Luca', 'Lucia', 'Lucky', 'Lucy', 'Lugan',
'Lulu', 'Luna', 'Lupe', 'Luther', 'Mabel', 'Mac', 'Mack', 'Maddie',
'Maggie', 'Mairi', 'Maisey', 'Major', 'Maks', 'Malcolm', 'Malikai',
'Margo', 'Mark', 'Marlee', 'Marley', 'Marq', 'Marty', 'Marvin',
'Mary', 'Mason', 'Mattie', 'Maude', 'Mauve', 'Max', 'Maxaroni',
'Maximus', 'Maxwell', 'Maya', 'Meatball', 'Meera', 'Meyer', 'Mia',
'Michelangelope', 'Miguel', 'Mike', 'Miley', 'Milky', 'Millie',
'Milo', 'Mimosa', 'Mingus', 'Mister', 'Misty', 'Mitch', 'Mo',
'Moe', 'Mojo', 'Mollie', 'Molly', 'Mona', 'Monkey', 'Monster',
'Monty', 'Moofasa', 'Mookie', 'Moose', 'Moreton', 'Mosby',
'Murphy', 'Mutt', 'Mya', 'Nala', 'Naphaniel', 'Napolean', 'Nelly',
'Neptune', 'Newt', 'Nico', 'Nida', 'Nigel', 'Nimbus', 'Noah',
'Nollie', 'None', 'Noosh', 'Norman', 'Nugget', 'O', 'Oakley',
'Obi', 'Obie', 'Oddie', 'Odie', 'Odin', 'Olaf', 'Ole', 'Olive',
'Oliver', 'Olivia', 'Oliviér', 'Ollie', 'Opal', 'Opie', 'Oreo',
'Orion', 'Oscar', 'Oshie', 'Otis', 'Ozzie', 'Ozzy', 'Pablo',
'Paisley', 'Pancake', 'Panda', 'Patch', 'Patrick', 'Paull',
'Pavlov', 'Pawnd', 'Peaches', 'Peanut', 'Penelope', 'Penny',
'Pepper', 'Percy', 'Perry', 'Pete', 'Petrick', 'Pherb', 'Phil',
'Philbert', 'Philippe', 'Phineas', 'Phred', 'Pickles', 'Pilot',
'Pinot', 'Pip', 'Piper', 'Pippa', 'Pippin', 'Pipsy', 'Pluto',
'Poppy', 'Pubert', 'Puff', 'Pumpkin', 'Pupcasso', 'Quinn', 'Ralf',
'Ralph', 'Ralpher', 'Ralphie', 'Ralphson', 'Ralphus', 'Ralphy',
'Ralphé', 'Rambo', 'Randall', 'Raphael', 'Rascal', 'Raymond',
'Reagan', 'Reese', 'Reggie', 'Reginald', 'Remington', 'Remus',
'Remy', 'Reptar', 'Rey', 'Rhino', 'Richie', 'Ricky', 'Ridley',
'Riley', 'Rilo', 'Rinna', 'River', 'Rizzo', 'Rizzy', 'Robin',
'Rocco', 'Rocky', 'Rodman', 'Rodney', 'Rolf', 'Romeo', 'Ron',
'Ronduh', 'Ronnie', 'Rontu', 'Rooney', 'Roosevelt', 'Rorie',
'Rory', 'Roscoe', 'Rose', 'Rosie', 'Rover', 'Rubio', 'Ruby',
'Rudy', 'Rueben', 'Ruffles', 'Rufio', 'Rufus', 'Rumble', 'Rumpole',
'Rupert', 'Rusty', 'Sadie', 'Sage', 'Sailer', 'Sailor', 'Sam',
'Sammy', 'Sampson', 'Samsom', 'Samson', 'Sandra', 'Sandy', 'Sansa',
'Sarge', 'Saydee', 'Schnitzel', 'Schnozz', 'Scooter', 'Scott',
'Scout', 'Scruffers', 'Seamus', 'Sebastian', 'Sephie', 'Severus',
'Shadoe', 'Shadow', 'Shaggy', 'Shakespeare', 'Shawwn', 'Shelby',
'Shikha', 'Shiloh', 'Shnuggles', 'Shooter', 'Siba', 'Sid',
'Sierra', 'Simba', 'Skittle', 'Skittles', 'Sky', 'Skye', 'Smiley',
'Smokey', 'Snickers', 'Snicku', 'Snoop', 'Snoopy', 'Sobe', 'Socks',
'Sojourner', 'Solomon', 'Sonny', 'Sophie', 'Sora', 'Spanky',
'Spark', 'Sparky', 'Spencer', 'Sprinkles', 'Sprout', 'Staniel',
'Stanley', 'Stark', 'Stefan', 'Stella', 'Stephan', 'Stephanus',
'Steve', 'Steven', 'Stewie', 'Storkson', 'Stormy', 'Strider',
'Striker', 'Strudel', 'Stu', 'Stuart', 'Stubert', 'Sugar', 'Suki',
'Sully', 'Sundance', 'Sunny', 'Sunshine', 'Superpup', 'Swagger',
'Sweet', 'Sweets', 'Taco', 'Tango', 'Tanner', 'Tassy', 'Tater',
'Tayzie', 'Taz', 'Tebow', 'Ted', 'Tedders', 'Teddy', 'Tedrick',
'Terrance', 'Terrenth', 'Terry', 'Tess', 'Tessa', 'Theo',
'Theodore', 'Thor', 'Thumas', 'Tiger', 'Tilly', 'Timber',
'Timison', 'Timmy', 'Timofy', 'Tino', 'Titan', 'Tito', 'Tobi',
'Toby', 'Todo', 'Toffee', 'Tom', 'Tommy', 'Tonks', 'Torque',
'Tove', 'Travis', 'Traviss', 'Trevith', 'Trigger', 'Trip', 'Tripp',
'Trooper', 'Tuck', 'Tucker', 'Tuco', 'Tug', 'Tupawc', 'Tycho',
'Tyr', 'Tyrone', 'Tyrus', 'Ulysses', 'Venti', 'Vince', 'Vincent',
'Vinnie', 'Vinscent', 'Vixen', 'Wafer', 'Waffles', 'Walker',
'Wallace', 'Wally', 'Walter', 'Watson', 'Wesley', 'Wiggles',
'Willem', 'William', 'Willie', 'Willow', 'Willy', 'Wilson',
'Winifred', 'Winnie', 'Winston', 'Wishes', 'Wyatt', 'Yoda', 'Yogi',
'Yukon', 'Zara', 'Zeek', 'Zeke', 'Zeus', 'Ziva', 'Zoe', 'Zoey',
'Zooey', 'Zuzu', 'a', 'actually', 'all', 'an', 'by', 'getting',
'his', 'incredibly', 'infuriating', 'just', 'life', 'light', 'mad',
'my', 'not', 'officially', 'old', 'one', 'quite', 'space', 'such',
'the', 'this', 'unacceptable', 'very'], dtype=object)
#Check how many rating have denominator other than 10
len(twitter_archive.query('rating_denominator != 10'))
23
#Checking tweets with denominator other than 10
twitter_archive.query('rating_denominator != 10')[['text', 'rating_denominator']]
| text | rating_denominator | |
|---|---|---|
| 313 | @jonnysun @Lin_Manuel ok jomny I know you're e... | 0 |
| 342 | @docmisterio account started on 11/15/15 | 15 |
| 433 | The floofs have been released I repeat the flo... | 70 |
| 516 | Meet Sam. She smiles 24/7 & secretly aspir... | 7 |
| 784 | RT @dog_rates: After so many requests, this is... | 11 |
| 902 | Why does this never happen at my front door...... | 150 |
| 1068 | After so many requests, this is Bretagne. She ... | 11 |
| 1120 | Say hello to this unbelievably well behaved sq... | 170 |
| 1165 | Happy 4/20 from the squad! 13/10 for all https... | 20 |
| 1202 | This is Bluebert. He just saw that both #Final... | 50 |
| 1228 | Happy Saturday here's 9 puppers on a bench. 99... | 90 |
| 1254 | Here's a brigade of puppers. All look very pre... | 80 |
| 1274 | From left to right:\nCletus, Jerome, Alejandro... | 50 |
| 1351 | Here is a whole flock of puppers. 60/50 I'll ... | 50 |
| 1433 | Happy Wednesday here's a bucket of pups. 44/40... | 40 |
| 1598 | Yes I do realize a rating of 4/20 would've bee... | 20 |
| 1634 | Two sneaky puppers were not initially seen, mo... | 130 |
| 1635 | Someone help the girl is being mugged. Several... | 110 |
| 1662 | This is Darrel. He just robbed a 7/11 and is i... | 11 |
| 1663 | I'm aware that I could've said 20/16, but here... | 16 |
| 1779 | IT'S PUPPERGEDDON. Total of 144/120 ...I think... | 120 |
| 1843 | Here we have an entire platoon of puppers. Tot... | 80 |
| 2335 | This is an Albanian 3 1/2 legged Episcopalian... | 2 |
twitter_archive.rating_numerator.min() , twitter_archive.rating_numerator.max()
(0, 1776)
twitter_archive.rating_numerator.describe()
count 2356.000000 mean 13.126486 std 45.876648 min 0.000000 25% 10.000000 50% 11.000000 75% 12.000000 max 1776.000000 Name: rating_numerator, dtype: float64
#Checking Outliers of the numerator (any rating greater than 15 or less than 6)
import plotly.express as px
px.box(twitter_archive, y='rating_numerator')
#checking how many outliers are in the numerator column
len(twitter_archive.query('(rating_numerator > 15) | (rating_numerator < 6)' ))
119
twitter_archive.query('(rating_numerator > 15) | (rating_numerator < 6)')[['text', 'rating_numerator']]
| text | rating_numerator | |
|---|---|---|
| 45 | This is Bella. She hopes her smile made you sm... | 5 |
| 55 | @roushfenway These are good dogs but 17/10 is ... | 17 |
| 188 | @dhmontgomery We also gave snoop dogg a 420/10... | 420 |
| 189 | @s8n You tried very hard to portray this good ... | 666 |
| 290 | @markhoppus 182/10 | 182 |
| ... | ... | ... |
| 2334 | This is a funny dog. Weird toes. Won't come do... | 3 |
| 2335 | This is an Albanian 3 1/2 legged Episcopalian... | 1 |
| 2338 | Not familiar with this breed. No tail (weird).... | 1 |
| 2349 | This is an odd dog. Hard on the outside but lo... | 2 |
| 2351 | Here we have a 1949 1st generation vulpix. Enj... | 5 |
119 rows × 2 columns
#Checking tweets source values for consistency
twitter_archive.source.value_counts()
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> 2221 <a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a> 91 <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> 33 <a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a> 11 Name: source, dtype: int64
#Checking doggo column values for consistency and understanding the data
twitter_archive['doggo'].value_counts()
None 2259 doggo 97 Name: doggo, dtype: int64
#Checking floofer column values for consistency and understanding the data
twitter_archive['floofer'].value_counts()
None 2346 floofer 10 Name: floofer, dtype: int64
#Checking pupper column values for consistency and understanding the data
twitter_archive['pupper'].value_counts()
None 2099 pupper 257 Name: pupper, dtype: int64
#Checking puppo column values for consistency and understanding the data
twitter_archive['puppo'].value_counts()
None 2326 puppo 30 Name: puppo, dtype: int64
#Getting Image Prediction columns datatypes and null values
image_prediction.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2075 non-null int64 1 jpg_url 2075 non-null object 2 img_num 2075 non-null int64 3 p1 2075 non-null object 4 p1_conf 2075 non-null float64 5 p1_dog 2075 non-null bool 6 p2 2075 non-null object 7 p2_conf 2075 non-null float64 8 p2_dog 2075 non-null bool 9 p3 2075 non-null object 10 p3_conf 2075 non-null float64 11 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(2), object(4) memory usage: 152.1+ KB
#Checking if there's duplicated rows in the table
image_prediction.duplicated().sum()
0
#Checking duplicated values in tweet_id column
image_prediction.tweet_id.duplicated().sum()
0
#Checking values of p1 column
image_prediction.p1.value_counts()
golden_retriever 150
Labrador_retriever 100
Pembroke 89
Chihuahua 83
pug 57
...
pillow 1
carousel 1
bald_eagle 1
lorikeet 1
orange 1
Name: p1, Length: 378, dtype: int64
#Getting all values of p1 column
image_prediction.p1.unique()
array(['Welsh_springer_spaniel', 'redbone', 'German_shepherd',
'Rhodesian_ridgeback', 'miniature_pinscher',
'Bernese_mountain_dog', 'box_turtle', 'chow', 'shopping_cart',
'miniature_poodle', 'golden_retriever', 'Gordon_setter',
'Walker_hound', 'pug', 'bloodhound', 'Lhasa', 'English_setter',
'hen', 'desktop_computer', 'Italian_greyhound', 'Maltese_dog',
'three-toed_sloth', 'ox', 'malamute', 'guinea_pig',
'soft-coated_wheaten_terrier', 'Chihuahua',
'black-and-tan_coonhound', 'coho', 'toy_terrier',
'Blenheim_spaniel', 'Pembroke', 'llama',
'Chesapeake_Bay_retriever', 'curly-coated_retriever', 'dalmatian',
'Ibizan_hound', 'Border_collie', 'Labrador_retriever', 'seat_belt',
'snail', 'miniature_schnauzer', 'Airedale', 'triceratops', 'swab',
'hay', 'hyena', 'jigsaw_puzzle', 'West_Highland_white_terrier',
'toy_poodle', 'giant_schnauzer', 'vizsla', 'vacuum', 'Rottweiler',
'Siberian_husky', 'teddy', 'papillon', 'Saint_Bernard',
'porcupine', 'goose', 'Tibetan_terrier', 'borzoi', 'beagle',
'hare', 'Yorkshire_terrier', 'Pomeranian', 'electric_fan',
'web_site', 'ibex', 'kuvasz', 'fire_engine', 'lorikeet',
'flat-coated_retriever', 'toyshop', 'common_iguana',
'Norwegian_elkhound', 'frilled_lizard', 'leatherback_turtle',
'hamster', 'Angora', 'Arctic_fox', 'trombone', 'canoe',
'king_penguin', 'shopping_basket', 'standard_poodle',
'Staffordshire_bullterrier', 'basenji', 'Lakeland_terrier',
'American_Staffordshire_terrier', 'bearskin', 'Shih-Tzu',
'bustard', 'crash_helmet', 'French_bulldog', 'Pekinese',
'komondor', 'ski_mask', 'malinois', 'kelpie', 'Brittany_spaniel',
'cocker_spaniel', 'shower_curtain', 'basset', 'jellyfish',
'doormat', 'Arabian_camel', 'lynx', 'hog', 'comic_book', 'minivan',
'seashore', 'cuirass', 'Brabancon_griffon', 'candle', 'Eskimo_dog',
'weasel', 'Christmas_stocking', 'washbasin', 'car_mirror',
'piggy_bank', 'pot', 'boathouse', 'mud_turtle',
'German_short-haired_pointer', 'Shetland_sheepdog',
'Irish_terrier', 'cairn', 'platypus', 'English_springer',
'whippet', 'ping-pong_ball', 'sea_urchin', 'bow_tie',
'window_shade', "jack-o'-lantern", 'sorrel', 'Sussex_spaniel',
'peacock', 'axolotl', 'wool', 'banana', 'Dandie_Dinmont',
'Norwich_terrier', 'wood_rabbit', 'dhole', 'keeshond',
'Norfolk_terrier', 'lacewing', 'dingo', 'brown_bear',
'Old_English_sheepdog', 'scorpion', 'flamingo', 'microphone',
'Samoyed', 'pitcher', 'African_hunting_dog', 'refrigerator',
'picket_fence', 'tub', 'zebra', 'hermit_crab', 'swing', 'Doberman',
'park_bench', 'feather_boa', 'Loafer', 'stone_wall', 'ice_bear',
'prayer_rug', 'chimpanzee', 'china_cabinet', 'bee_eater',
'tennis_ball', 'carton', 'killer_whale', 'ostrich', 'terrapin',
'Siamese_cat', 'gondola', 'Great_Pyrenees', 'microwave',
'starfish', 'sandbar', 'tusker', 'motor_scooter', 'ram',
'leaf_beetle', 'wombat', 'schipperke', 'Newfoundland',
'bull_mastiff', 'water_bottle', 'suit', 'toilet_seat', 'collie',
'robin', 'Cardigan', 'Greater_Swiss_Mountain_dog', 'slug',
'toilet_tissue', 'acorn_squash', 'soccer_ball',
'African_crocodile', 'tick', 'ocarina', 'boxer', 'street_sign',
'bow', 'stove', 'paper_towel', 'upright', 'dough',
'Scottish_deerhound', 'bath_towel', 'standard_schnauzer',
'walking_stick', 'Irish_water_spaniel', 'bubble', 'Boston_bull',
'book_jacket', 'rain_barrel', 'black-footed_ferret', 'guenon',
'Japanese_spaniel', 'water_buffalo', 'patio', 'cowboy_hat',
'dogsled', 'maze', 'harp', 'panpipe', 'cash_machine', 'mailbox',
'wallaby', 'EntleBucher', 'earthstar', 'pillow', 'bluetick',
'space_heater', 'carousel', 'Irish_setter', 'birdhouse', 'snorkel',
'bald_eagle', 'koala', 'Leonberg', 'cheetah', 'minibus',
'Weimaraner', 'clog', 'dishwasher', 'white_wolf', 'sliding_door',
'damselfly', 'Great_Dane', 'Tibetan_mastiff', 'cheeseburger',
'fiddler_crab', 'bannister', 'crane', 'Scotch_terrier',
'snowmobile', 'badger', 'bighorn', 'geyser', 'barrow', 'bison',
'Mexican_hairless', 'ice_lolly', 'sea_lion', 'dining_table',
'groenendael', 'Australian_terrier', 'beaver', 'briard',
'Appenzeller', 'grey_fox', 'mousetrap', 'hippopotamus',
'Border_terrier', 'hummingbird', 'tailed_frog', 'otter',
'Egyptian_cat', 'four-poster', 'wild_boar', 'bathtub', 'agama',
'muzzle', 'hotdog', 'bib', 'espresso', 'timber_wolf', 'meerkat',
'nail', 'hammer', 'home_theater', 'alp', 'bonnet', 'handkerchief',
'hand_blower', 'polecat', 'lakeside', 'studio_couch', 'cup',
'cliff', 'Bedlington_terrier', 'lawn_mower', 'balloon',
'sunglasses', 'rapeseed', 'traffic_light', 'coil', 'binoculars',
'paddle', 'tiger_shark', 'sulphur-crested_cockatoo',
'wire-haired_fox_terrier', 'Saluki', 'American_black_bear',
'rotisserie', 'conch', 'skunk', 'bookshop', 'radio_telescope',
'cougar', 'African_grey', 'coral_reef', 'lion', 'maillot',
'Madagascar_cat', 'tabby', 'silky_terrier', 'giant_panda',
'long-horned_beetle', 'Afghan_hound', 'clumber', 'sundial',
'padlock', 'pool_table', 'quilt', 'beach_wagon', 'remote_control',
'bakery', 'pedestal', 'gas_pump', 'bookcase', 'shield', 'loupe',
'restaurant', 'prison', 'school_bus', 'cowboy_boot', 'jersey',
'wooden_spoon', 'leopard', 'mortarboard', 'teapot',
'military_uniform', 'washer', 'coffee_mug', 'fountain',
'pencil_box', 'barbell', 'grille', 'revolver', 'envelope',
'syringe', 'marmot', 'pole', 'laptop', 'basketball', 'tricycle',
'convertible', 'limousine', 'orange'], dtype=object)
#Checking some odd data
image_prediction.query('p1 == "shopping_cart"').jpg_url
8 https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg 703 https://pbs.twimg.com/media/CYFOP6cWEAAWp-k.jpg 1432 https://pbs.twimg.com/media/CrtYRMEWIAAUkCl.jpg 1760 https://pbs.twimg.com/media/C3YaSnQWAAILgz0.jpg 1834 https://pbs.twimg.com/media/C52V7PzWcAA_pVv.jpg Name: jpg_url, dtype: object

#Checking values of p2 column
image_prediction.p2.value_counts()
Labrador_retriever 104
golden_retriever 92
Cardigan 73
Chihuahua 44
Pomeranian 42
...
medicine_chest 1
quail 1
horse_cart 1
waffle_iron 1
bagel 1
Name: p2, Length: 405, dtype: int64
#Getting all values of p2 column
image_prediction.p2.unique()
array(['collie', 'miniature_pinscher', 'malinois', 'redbone',
'Rottweiler', 'English_springer', 'mud_turtle', 'Tibetan_mastiff',
'shopping_basket', 'komondor', 'Yorkshire_terrier',
'English_foxhound', 'bull_mastiff', 'German_shepherd', 'Shih-Tzu',
'Newfoundland', 'cock', 'desk', 'toy_terrier', 'toy_poodle',
'otter', 'Chesapeake_Bay_retriever', 'Siberian_husky', 'skunk',
'Afghan_hound', 'bloodhound', 'barracouta', 'papillon',
'cocker_spaniel', 'chow', 'Irish_terrier', 'chain_saw', 'beagle',
'giant_schnauzer', 'Labrador_retriever', 'Pembroke', 'Chihuahua',
'Weimaraner', 'slug', 'Brittany_spaniel', 'standard_schnauzer',
'teddy', 'armadillo', 'African_hunting_dog', 'vizsla', 'doormat',
'pug', 'Italian_greyhound', 'Samoyed', 'Pomeranian',
'miniature_poodle', 'Lakeland_terrier', 'Irish_setter', 'swab',
'malamute', 'bath_towel', 'Border_collie', 'Leonberg', 'drake',
'French_bulldog', 'ice_bear', 'Christmas_stocking',
'golden_retriever', 'standard_poodle', 'dhole', 'kuvasz',
'Cardigan', 'silky_terrier', 'spotlight', 'dishwasher', 'bighorn',
'tow_truck', 'hummingbird', 'English_setter', 'prayer_rug',
'frilled_lizard', 'Pekinese', 'ox', 'boxer', 'hog', 'guinea_pig',
'hen', 'wallaby', 'cowboy_boot', 'cornet', 'minivan', 'paddle',
'basset', 'hamper', 'Bedlington_terrier', 'Shetland_sheepdog',
'bow', 'Lhasa', 'pelican', 'toaster', 'groenendael',
'Australian_terrier', 'llama', 'knee_pad', 'pillow',
'Ibizan_hound', 'Old_English_sheepdog', 'Welsh_springer_spaniel',
'coral_reef', 'bison', 'waffle_iron', 'tabby', 'bib', 'police_van',
'Eskimo_dog', 'breastplate', 'German_short-haired_pointer',
'Norfolk_terrier', 'Blenheim_spaniel', 'pickup',
'miniature_schnauzer', 'lampshade', 'Tibetan_terrier',
'Siamese_cat', 'borzoi', 'studio_couch', 'toilet_seat', 'hamster',
'seat_belt', 'keeshond', 'koala', 'hair_spray', 'Saint_Bernard',
'tray', 'American_Staffordshire_terrier', 'birdhouse', 'terrapin',
'Staffordshire_bullterrier', 'briard',
'West_Highland_white_terrier', 'spotted_salamander', 'tennis_ball',
'porcupine', 'cardigan', 'corn', 'basenji', 'otterhound',
'European_gallinule', 'indri', 'tailed_frog', 'beach_wagon',
'siamang', 'orange', 'home_theater', 'cairn', 'hare',
'Norwegian_elkhound', 'Brabancon_griffon', 'American_black_bear',
'sulphur_butterfly', 'Sealyham_terrier', 'Walker_hound',
'tarantula', 'Persian_cat', 'coral_fungus', 'accordion', 'kelpie',
'Great_Pyrenees', 'wood_rabbit', 'black-and-tan_coonhound',
'sunglasses', 'plow', 'whippet', 'rain_barrel', 'bathtub', 'tiger',
'snail', 'tick', 'wire-haired_fox_terrier', 'water_bottle', 'wig',
'platypus', 'Irish_wolfhound', 'ram', 'gorilla',
'entertainment_center', 'toucan', 'mask', 'shopping_cart',
'Sussex_spaniel', 'crate', 'grey_whale', 'badger', 'Airedale',
'Arabian_camel', 'cockroach', 'lifeboat', 'rotisserie', 'goldfish',
'stingray', 'warthog', 'bobsled', 'rhinoceros_beetle', 'beaver',
'brown_bear', 'Maltese_dog', 'weasel', 'quill',
'Rhodesian_ridgeback', 'Arctic_fox', 'ashcan', 'bow_tie',
'soft-coated_wheaten_terrier', 'schipperke', 'bearskin',
'Kerry_blue_terrier', 'ice_lolly', 'American_alligator',
'mosquito_net', 'sea_lion', 'Boston_bull', 'nail',
'black-footed_ferret', 'promontory', 'sarong', 'Doberman',
'space_heater', 'Great_Dane', 'mailbox', 'Saluki', 'bakery',
'sandal', 'leafhopper', 'barrel', 'water_buffalo', 'polecat',
'macaque', 'Japanese_spaniel', 'folding_chair', 'trench_coat',
'Angora', 'junco', 'crib', 'dalmatian', 'snowmobile',
'flat-coated_retriever', 'streetcar', 'window_screen', 'bannister',
'hair_slide', 'meerkat', 'cannon', 'jaguar', 'Border_terrier',
'Greater_Swiss_Mountain_dog', 'hay', 'apron', 'cloak', 'radiator',
'muzzle', 'feather_boa', 'rifle', 'chimpanzee', 'loggerhead',
'torch', 'Mexican_hairless', 'spindle', 'triceratops',
'Appenzeller', 'stove', 'dingo', 'oscilloscope', 'common_newt',
'hotdog', 'medicine_chest', 'quail', 'horse_cart', 'four-poster',
'pier', 'red_fox', 'affenpinscher', 'assault_rifle',
'mashed_potato', 'moped', 'hyena', 'seashore', 'tub', 'sports_car',
'swing', 'mink', 'bluetick', 'neck_brace', 'grey_fox', 'mongoose',
'fur_coat', 'Scotch_terrier', 'spatula', 'paper_towel', 'shoji',
'toyshop', 'banded_gecko', 'peacock', 'crossword_puzzle',
'tree_frog', 'wombat', 'turnstile', 'sleeping_bag', 'quilt',
'Gila_monster', 'giant_panda', 'Dandie_Dinmont', 'handkerchief',
'sombrero', 'Indian_elephant', 'coffee_mug', 'gibbon', 'carton',
'screw', 'minibus', 'hatchet', 'window_shade', 'lawn_mower',
'washbasin', 'sock', 'prison', 'patio', 'china_cabinet',
'chain_mail', 'breakwater', 'computer_keyboard', 'goose',
'lakeside', 'solar_dish', 'table_lamp', 'Windsor_tie',
'punching_bag', 'comic_book', 'sunglass', 'great_white_shark',
'timber_wolf', 'fountain', 'dugong', 'marmot', 'barbershop',
'shovel', 'curly-coated_retriever', 'lesser_panda', 'monitor',
'crutch', 'cash_machine', 'printer', 'volcano', 'wallet', 'laptop',
'bathing_cap', 'confectionery', 'dam', 'killer_whale', 'canoe',
'Madagascar_cat', 'jean', 'boathouse', 'cliff', 'maillot', 'iPod',
'hand-held_computer', 'black_widow', 'Norwich_terrier', 'necklace',
'dining_table', 'binoculars', 'cradle', 'sea_urchin', 'cougar',
'EntleBucher', 'basketball', 'lighter', 'saltshaker', 'harvester',
'television', 'envelope', 'house_finch', 'web_site', 'palace',
'shower_curtain', 'cab', 'snorkel', 'jigsaw_puzzle', 'sweatshirt',
'white_wolf', 'sliding_door', 'academic_gown', 'cowboy_hat',
'can_opener', 'cup', 'rule', 'soccer_ball', 'bucket', 'racket',
'menu', 'purse', 'Bernese_mountain_dog', 'dumbbell', 'projectile',
'dock', 'oxygen_mask', 'sandbar', 'umbrella', 'shower_cap',
'bagel'], dtype=object)
#Checking some odd data
image_prediction.query('p2 == "envelope"').jpg_url
1626 https://pbs.twimg.com/media/Cyh5mQTW8AQpB6K.jpg 1696 https://pbs.twimg.com/media/C1SddosXUAQcVR1.jpg 1975 https://pbs.twimg.com/media/DBMV3NnXUAAm0Pp.jpg Name: jpg_url, dtype: object

image_prediction.p3.value_counts()
Labrador_retriever 79
Chihuahua 58
golden_retriever 48
Eskimo_dog 38
kelpie 35
..
ox 1
assault_rifle 1
axolotl 1
pot 1
banana 1
Name: p3, Length: 408, dtype: int64
#Getting all values of p3 column
image_prediction.p3.unique()
array(['Shetland_sheepdog', 'Rhodesian_ridgeback', 'bloodhound',
'miniature_pinscher', 'Doberman', 'Greater_Swiss_Mountain_dog',
'terrapin', 'fur_coat', 'golden_retriever',
'soft-coated_wheaten_terrier', 'Labrador_retriever', 'Pekinese',
'Ibizan_hound', 'French_bulldog', 'malinois', 'Dandie_Dinmont',
'borzoi', 'partridge', 'bookcase', 'basenji', 'miniature_poodle',
'great_grey_owl', 'groenendael', 'Eskimo_dog', 'hamster', 'briard',
'papillon', 'flat-coated_retriever', 'gar', 'Chihuahua',
'Shih-Tzu', 'Pomeranian', 'dingo', 'power_drill', 'Saluki',
'Great_Pyrenees', 'West_Highland_white_terrier', 'collie',
'toy_poodle', 'vizsla', 'acorn', 'giant_schnauzer', 'teddy',
'common_iguana', 'wig', 'water_buffalo', 'coyote', 'seat_belt',
'kelpie', 'space_heater', 'Brabancon_griffon', 'standard_poodle',
'beagle', 'Irish_water_spaniel', 'bluetick', 'Weimaraner',
'Chesapeake_Bay_retriever', 'toilet_tissue',
'black-and-tan_coonhound', 'kuvasz', 'Christmas_stocking',
'badger', 'hen', 'Staffordshire_bullterrier', 'Yorkshire_terrier',
'Lakeland_terrier', 'weasel', 'ski_mask', 'cocker_spaniel',
'Australian_terrier', 'lampshade', 'oscilloscope', 'ram', 'jeep',
'ice_bear', 'African_grey', 'Great_Dane', 'curly-coated_retriever',
'doormat', 'African_chameleon', 'schipperke', 'muzzle',
'triceratops', 'Newfoundland', 'Band_Aid', 'wood_rabbit',
'white_wolf', 'giant_panda', 'Welsh_springer_spaniel',
'French_horn', 'toy_terrier', 'Pembroke', 'Cardigan', 'bassinet',
'pug', 'Afghan_hound', 'American_Staffordshire_terrier', 'whippet',
'English_setter', 'panpipe', 'crane', 'mouse', 'titi', 'Angora',
'Boston_bull', 'silky_terrier', 'Japanese_spaniel', 'sandbar',
'balance_beam', 'black-footed_ferret', 'miniature_schnauzer',
'Blenheim_spaniel', 'bathtub', 'Saint_Bernard', 'redbone',
'goldfish', 'Norfolk_terrier', 'llama', 'koala', 'pillow',
'jersey', 'chow', 'minibus', 'malamute', 'bulletproof_vest',
'beach_wagon', 'cairn', 'plunger', 'paper_towel', 'wing',
'English_foxhound', 'Brittany_spaniel', 'bolete', 'ashcan',
'box_turtle', 'guinea_pig', 'bison', 'bull_mastiff', 'racket',
'cardoon', 'Tibetan_mastiff', 'window_screen', 'Irish_terrier',
'agama', 'common_newt', 'car_wheel', 'gorilla', 'bagel', 'clumber',
'Egyptian_cat', 'television', 'boxer', 'brown_bear', 'leafhopper',
'German_shepherd', 'Border_collie', 'menu', 'wolf_spider',
'bathing_cap', 'stinkhorn', 'drumstick', 'mask',
'Scottish_deerhound', 'shower_curtain', 'Appenzeller',
'plastic_bag', 'swimming_trunks', 'prairie_chicken', 'red_wolf',
'Maltese_dog', 'snail', 'gibbon', 'Gordon_setter', 'black_swan',
'beacon', 'wool', 'cowboy_boot', 'Rottweiler', 'poncho', 'swing',
'Arctic_fox', 'bib', 'Italian_greyhound', 'steam_locomotive',
'fountain', 'chickadee', 'abaya', 'Border_terrier', 'bubble',
'chimpanzee', 'hammerhead', 'Norwegian_elkhound',
'Norwich_terrier', 'Airedale', 'Siamese_cat', 'sea_cucumber',
'seashore', 'nipple', 'moped', 'Arabian_camel', 'crayfish',
'wallaby', 'wire-haired_fox_terrier', 'toilet_seat',
'Old_English_sheepdog', 'pajama', 'Walker_hound', 'shovel',
'bucket', 'Sealyham_terrier', 'Windsor_tie', 'Siberian_husky',
'quill', 'Persian_cat', 'European_fire_salamander',
'three-toed_sloth', 'swab', 'echidna', 'tennis_ball', 'Lhasa',
'coral_reef', 'keeshond', 'mink', 'screw', 'basset', 'wreck',
'kimono', 'German_short-haired_pointer', 'joystick', 'microwave',
'Tibetan_terrier', 'Irish_wolfhound', 'Samoyed', 'loggerhead',
'French_loaf', 'Irish_setter', 'komondor', 'purse', 'greenhouse',
'broccoli', 'shopping_basket', 'macaque', 'squirrel_monkey',
'green_lizard', 'parallel_bars', 'cloak', 'chest', 'sundial',
'mosquito_net', 'bath_towel', 'cuirass', 'zebra', 'lumbermill',
'wallet', 'feather_boa', 'English_springer', 'electric_fan',
'hippopotamus', 'ox', 'quilt', 'assault_rifle', 'axolotl', 'pot',
'toyshop', 'pizza', 'scuba_diver', 'beaver', 'Mexican_hairless',
'cliff', 'loupe', 'wild_boar', 'jaguar', 'hog', 'polecat', 'lion',
'EntleBucher', 'hand-held_computer', 'washbasin', 'whiptail',
'rock_crab', 'hare', 'shoji', 'sombrero', 'bell_cote', 'rifle',
'goose', 'pickup', 'sunglasses', 'limousine', 'bow_tie', 'pretzel',
'marmot', 'ice_lolly', 'vacuum', 'dalmatian', 'prison',
'shower_cap', 'sliding_door', 'dugong', 'otterhound', 'eel',
'binder', 'bullfrog', 'soap_dispenser', 'sea_lion', 'carton',
'brass', 'mitten', 'golfcart', 'cougar', 'warthog', 'umbrella',
'neck_brace', 'cup', 'book_jacket', 'padlock', 'cab', 'chime',
'Leonberg', 'viaduct', 'American_black_bear', 'tub', 'hand_blower',
'king_penguin', 'rotisserie', 'bannister', 'passenger_car',
'mongoose', 'dhole', 'consomme', 'valley', 'park_bench',
'mushroom', 'barrow', 'parachute', 'desktop_computer', 'snorkel',
'wok', 'affenpinscher', 'space_shuttle', 'rain_barrel',
'ballplayer', 'mountain_tent', 'oxcart', 'buckeye', 'sunglass',
'croquet_ball', 'refrigerator', 'snow_leopard', 'tripod',
'rapeseed', 'tiger_cat', 'Bernese_mountain_dog', 'notebook',
'maraca', 'pool_table', 'lakeside', 'theater_curtain', 'pier',
'cheetah', 'mousetrap', 'pop_bottle', 'soccer_ball', 'wombat',
'rhinoceros_beetle', 'paddlewheel', 'paintbrush', 'maze',
'hatchet', 'chain', 'jigsaw_puzzle', 'switch',
'Kerry_blue_terrier', 'barbell', 'convertible',
'entertainment_center', 'file', 'guillotine', 'nail',
'standard_schnauzer', 'bow', 'grocery_store', 'boathouse', 'conch',
'Bouvier_des_Flandres', 'grey_fox', 'shopping_cart', 'meerkat',
'grand_piano', 'envelope', 'screen', 'coffeepot', 'printer',
'otter', 'restaurant', 'bonnet', 'crossword_puzzle', 'go-kart',
'Sussex_spaniel', 'orangutan', 'canoe', 'barber_chair',
'traffic_light', 'ibex', 'can_opener', 'Indian_elephant',
'spatula', 'banana'], dtype=object)
image_prediction.query('p3 == "Band_Aid"').jpg_url
115 https://pbs.twimg.com/media/CUT9PuQWwAABQv7.jpg Name: jpg_url, dtype: object

#Check if there's repeated urls
image_prediction['jpg_url'].value_counts()
https://pbs.twimg.com/media/CZhn-QAWwAASQan.jpg 2
https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg 2
https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg 2
https://pbs.twimg.com/media/CU1zsMSUAAAS0qW.jpg 2
https://pbs.twimg.com/media/CsrjryzWgAAZY00.jpg 2
..
https://pbs.twimg.com/media/CXrmMSpUwAAdeRj.jpg 1
https://pbs.twimg.com/media/CXrawAhWkAAWSxC.jpg 1
https://pbs.twimg.com/media/CXrIntsUsAEkv0d.jpg 1
https://pbs.twimg.com/media/CXqcOHCUQAAugTB.jpg 1
https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1
Name: jpg_url, Length: 2009, dtype: int64
#Count number of duplicated urls
image_prediction.jpg_url.duplicated().sum()
66
len(image_prediction.query('p1_dog == False'))
543
Completness, Accuracy, Consistency, and Validity
I - tweet_id column in wrong data type -> convert into string
III - timestamp is string data type -> convert into datetime
IV - source coulmn has HTML anchor tag -> remove anchor tag from the column
V - rating_denominator coulmn has 23 entries where it's not equal to 10 -> Drop invalid numbers
VI - rating_numerator has 119 values that are considered outliers -> Drop outliers
VII - expanded_urls column has repeated values -> Drop duplicated values
VIII - change values in 'name', 'doggo', 'floofer', 'pupper', 'puppo' columns from none into nan
IX - Remove retweets from the data frame
I - tweet_id column in wrong data type -> convert into string
II - jpg_url has duplicated values -> drop duplicated values
I - merge name, doggo, floofer, pupper, puppo into one column
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
image_prediction = pd.read_csv('image_prediction.tsv', sep='\t')
json_tweets = pd.read_csv('json_tweets.csv')
#Make a copy of all data to wrangle and maintain orginal state of data also for reference
df_archive = twitter_archive.copy()
df_image = image_prediction.copy()
df_json = json_tweets.copy()
df_archive = df_archive[np.isnan(df_archive.retweeted_status_id)]
df_archive.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2175 entries, 0 to 2355 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2175 non-null int64 1 in_reply_to_status_id 78 non-null float64 2 in_reply_to_user_id 78 non-null float64 3 timestamp 2175 non-null object 4 source 2175 non-null object 5 text 2175 non-null object 6 retweeted_status_id 0 non-null float64 7 retweeted_status_user_id 0 non-null float64 8 retweeted_status_timestamp 0 non-null object 9 expanded_urls 2117 non-null object 10 rating_numerator 2175 non-null int64 11 rating_denominator 2175 non-null int64 12 name 2175 non-null object 13 doggo 2175 non-null object 14 floofer 2175 non-null object 15 pupper 2175 non-null object 16 puppo 2175 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 305.9+ KB
df_archive.tweet_id = df_archive.tweet_id.astype(str)
df_archive.dtypes
tweet_id object in_reply_to_status_id float64 in_reply_to_user_id float64 timestamp object source object text object retweeted_status_id float64 retweeted_status_user_id float64 retweeted_status_timestamp object expanded_urls object rating_numerator int64 rating_denominator int64 name object doggo object floofer object pupper object puppo object dtype: object
droped_columns = ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp']
df_archive.drop(droped_columns, axis = 1, inplace = True)
df_archive.columns
Index(['tweet_id', 'timestamp', 'source', 'text', 'expanded_urls',
'rating_numerator', 'rating_denominator', 'name', 'doggo', 'floofer',
'pupper', 'puppo'],
dtype='object')
format = "%Y-%m-%d %H:%M:%S"
df_archive.timestamp = pd.to_datetime(df_archive.timestamp, format = format)
df_archive.dtypes
tweet_id object timestamp datetime64[ns, UTC] source object text object expanded_urls object rating_numerator int64 rating_denominator int64 name object doggo object floofer object pupper object puppo object dtype: object
df_archive.source.value_counts()
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a> 2042 <a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a> 91 <a href="http://twitter.com" rel="nofollow">Twitter Web Client</a> 31 <a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a> 11 Name: source, dtype: int64
iphone = '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>'
vine = '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>'
web = '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'
tweetdeck = '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>'
iphone_n = 'Twitter for iPhone'
vine_n = 'Vine - Make a Scene'
web_n = 'Twitter Web Client'
tweetdeck_n = 'TweetDeck'
df_archive['source'] = df_archive['source'].str.replace(iphone, iphone_n, regex = True).replace(vine, vine_n).replace(web, web_n).replace(tweetdeck, tweetdeck_n)
df_archive['source'].value_counts()
Twitter for iPhone 2042 Vine - Make a Scene 91 Twitter Web Client 31 TweetDeck 11 Name: source, dtype: int64
print('Number of rows where rating_denominator is not equal to 10 is: {}'.format(len(df_archive.query('rating_denominator != 10'))))
print('Number of rows where rating_denominator is equal to 10 is: {}'.format(len(df_archive.query('rating_denominator == 10'))))
Number of rows where rating_denominator is not equal to 10 is: 22 Number of rows where rating_denominator is equal to 10 is: 2153
V - rating_denominator coulmn has 23 entries where it's not equal to 10 -> Drop invalid numbers
df_archive = df_archive[df_archive.rating_denominator == 10]
df_archive.rating_denominator.value_counts()
10 2153 Name: rating_denominator, dtype: int64
twitter_archive = twitter_archive.query('(rating_numerator <= 15) & (rating_numerator >= 6)')
twitter_archive.rating_numerator.value_counts()
12 558 11 464 10 461 13 351 9 158 8 102 7 55 14 54 6 32 15 2 Name: rating_numerator, dtype: int64
df_archive.expanded_urls.value_counts()
https://vine.co/v/ea0OwvPTx9l 2
https://twitter.com/dog_rates/status/892420643555336193/photo/1 1
https://twitter.com/dog_rates/status/684481074559381504/photo/1 1
https://twitter.com/dog_rates/status/683834909291606017/video/1 1
https://twitter.com/dog_rates/status/683849932751646720/photo/1 1
..
https://twitter.com/dog_rates/status/759047813560868866/photo/1,https://twitter.com/dog_rates/status/759047813560868866/photo/1 1
https://twitter.com/dog_rates/status/759099523532779520/photo/1 1
https://twitter.com/dog_rates/status/759197388317847553/photo/1,https://twitter.com/dog_rates/status/759197388317847553/photo/1,https://twitter.com/dog_rates/status/759197388317847553/photo/1 1
https://twitter.com/wsaznews/status/759167558763196416 1
https://twitter.com/dog_rates/status/666020888022790149/photo/1 1
Name: expanded_urls, Length: 2098, dtype: int64
df_archive.query('expanded_urls.isna() == True')
| tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30 | 886267009285017600 | 2017-07-15 16:51:35+00:00 | Twitter for iPhone | @NonWhiteHat @MayhewMayhem omg hello tanner yo... | NaN | 12 | 10 | None | None | None | None | None |
| 55 | 881633300179243008 | 2017-07-02 21:58:53+00:00 | Twitter for iPhone | @roushfenway These are good dogs but 17/10 is ... | NaN | 17 | 10 | None | None | None | None | None |
| 64 | 879674319642796034 | 2017-06-27 12:14:36+00:00 | Twitter for iPhone | @RealKentMurphy 14/10 confirmed | NaN | 14 | 10 | None | None | None | None | None |
| 113 | 870726314365509632 | 2017-06-02 19:38:25+00:00 | Twitter for iPhone | @ComplicitOwl @ShopWeRateDogs >10/10 is res... | NaN | 10 | 10 | None | None | None | None | None |
| 148 | 863427515083354112 | 2017-05-13 16:15:35+00:00 | Twitter for iPhone | @Jack_Septic_Eye I'd need a few more pics to p... | NaN | 12 | 10 | None | None | None | None | None |
| 179 | 857214891891077121 | 2017-04-26 12:48:51+00:00 | Twitter for iPhone | @Marc_IRL pixelated af 12/10 | NaN | 12 | 10 | None | None | None | None | None |
| 186 | 856288084350160898 | 2017-04-23 23:26:03+00:00 | Twitter for iPhone | @xianmcguire @Jenna_Marbles Kardashians wouldn... | NaN | 14 | 10 | None | None | None | None | None |
| 188 | 855862651834028034 | 2017-04-22 19:15:32+00:00 | Twitter for iPhone | @dhmontgomery We also gave snoop dogg a 420/10... | NaN | 420 | 10 | None | None | None | None | None |
| 189 | 855860136149123072 | 2017-04-22 19:05:32+00:00 | Twitter for iPhone | @s8n You tried very hard to portray this good ... | NaN | 666 | 10 | None | None | None | None | None |
| 218 | 850333567704068097 | 2017-04-07 13:04:55+00:00 | Twitter for iPhone | @markhoppus MARK THAT DOG HAS SEEN AND EXPERIE... | NaN | 13 | 10 | None | None | None | None | None |
| 228 | 848213670039564288 | 2017-04-01 16:41:12+00:00 | Twitter for iPhone | Jerry just apuppologized to me. He said there ... | NaN | 11 | 10 | None | None | None | None | None |
| 234 | 847617282490613760 | 2017-03-31 01:11:22+00:00 | Twitter for iPhone | .@breaannanicolee PUPDATE: Cannon has a heart ... | NaN | 13 | 10 | None | None | None | None | None |
| 274 | 840698636975636481 | 2017-03-11 22:59:09+00:00 | Twitter for iPhone | @0_kelvin_0 >10/10 is reserved for puppos s... | NaN | 10 | 10 | None | None | None | None | None |
| 290 | 838150277551247360 | 2017-03-04 22:12:52+00:00 | Twitter for iPhone | @markhoppus 182/10 | NaN | 182 | 10 | None | None | None | None | None |
| 291 | 838085839343206401 | 2017-03-04 17:56:49+00:00 | Twitter for iPhone | @bragg6of8 @Andy_Pace_ we are still looking fo... | NaN | 15 | 10 | None | None | None | None | None |
| 346 | 831926988323639298 | 2017-02-15 18:03:45+00:00 | Twitter for iPhone | @UNC can confirm 12/10 | NaN | 12 | 10 | None | None | None | None | None |
| 375 | 828361771580813312 | 2017-02-05 21:56:51+00:00 | Twitter Web Client | Beebop and Doobert should start a band 12/10 w... | NaN | 12 | 10 | None | None | None | None | None |
| 387 | 826598799820865537 | 2017-02-01 01:11:25+00:00 | Twitter for iPhone | I was going to do 007/10, but the joke wasn't ... | NaN | 7 | 10 | None | None | None | None | None |
| 409 | 823333489516937216 | 2017-01-23 00:56:15+00:00 | Twitter for iPhone | @HistoryInPics 13/10 | NaN | 13 | 10 | None | None | None | None | None |
| 427 | 821153421864615936 | 2017-01-17 00:33:26+00:00 | Twitter for iPhone | @imgur for a polar bear tho I'd say 13/10 is a... | NaN | 13 | 10 | None | None | None | None | None |
| 498 | 813130366689148928 | 2016-12-25 21:12:41+00:00 | Twitter for iPhone | I've been informed by multiple sources that th... | NaN | 12 | 10 | None | None | None | None | None |
| 513 | 811647686436880384 | 2016-12-21 19:01:02+00:00 | Twitter for iPhone | PUPDATE: I've been informed that Augie was act... | NaN | 11 | 10 | None | None | None | None | None |
| 570 | 801854953262350336 | 2016-11-24 18:28:13+00:00 | Twitter for iPhone | .@NBCSports OMG THE TINY HAT I'M GOING TO HAVE... | NaN | 11 | 10 | None | None | None | None | None |
| 576 | 800859414831898624 | 2016-11-22 00:32:18+00:00 | Twitter for iPhone | @SkyWilliams doggo simply protecting you from ... | NaN | 11 | 10 | None | doggo | None | None | None |
| 611 | 797165961484890113 | 2016-11-11 19:55:50+00:00 | Twitter for iPhone | @JODYHiGHROLLER it may be an 11/10 but what do... | NaN | 11 | 10 | None | None | None | None | None |
| 701 | 786051337297522688 | 2016-10-12 03:50:17+00:00 | Twitter for iPhone | 13/10 for breakdancing puppo @shibbnbot | NaN | 13 | 10 | None | None | None | None | puppo |
| 707 | 785515384317313025 | 2016-10-10 16:20:36+00:00 | Twitter for iPhone | Today, 10/10, should be National Dog Rates Day | NaN | 10 | 10 | None | None | None | None | None |
| 843 | 766714921925144576 | 2016-08-19 19:14:16+00:00 | Twitter for iPhone | His name is Charley and he already has a new s... | NaN | 13 | 10 | None | None | None | None | None |
| 857 | 763956972077010945 | 2016-08-12 04:35:10+00:00 | Twitter for iPhone | @TheEllenShow I'm not sure if you know this bu... | NaN | 12 | 10 | None | doggo | None | None | None |
| 967 | 750381685133418496 | 2016-07-05 17:31:49+00:00 | Twitter for iPhone | 13/10 such a good doggo\n@spaghemily | NaN | 13 | 10 | None | doggo | None | None | None |
| 1005 | 747651430853525504 | 2016-06-28 04:42:46+00:00 | Twitter for iPhone | Other pupper asked not to have his identity sh... | NaN | 12 | 10 | None | None | None | pupper | None |
| 1080 | 738891149612572673 | 2016-06-04 00:32:32+00:00 | Twitter for iPhone | @mount_alex3 13/10 | NaN | 13 | 10 | None | None | None | None | None |
| 1295 | 707983188426153984 | 2016-03-10 17:35:20+00:00 | Twitter for iPhone | @serial @MrRoles OH MY GOD I listened to all o... | NaN | 12 | 10 | None | None | None | None | None |
| 1345 | 704491224099647488 | 2016-03-01 02:19:31+00:00 | Twitter for iPhone | 13/10 hero af\n@ABC | NaN | 13 | 10 | None | None | None | None | None |
| 1445 | 696518437233913856 | 2016-02-08 02:18:30+00:00 | Twitter for iPhone | Oh my god 10/10 for every little hot dog pupper | NaN | 10 | 10 | None | None | None | pupper | None |
| 1446 | 696490539101908992 | 2016-02-08 00:27:39+00:00 | Twitter for iPhone | After reading the comments I may have overesti... | NaN | 1 | 10 | None | None | None | None | None |
| 1474 | 693644216740769793 | 2016-01-31 03:57:23+00:00 | Twitter for iPhone | BREAKING PUPDATE: I've just been notified that... | NaN | 10 | 10 | None | None | None | None | None |
| 1479 | 693582294167244802 | 2016-01-30 23:51:19+00:00 | Twitter for iPhone | Personally I'd give him an 11/10. Not sure why... | NaN | 11 | 10 | None | None | None | None | None |
| 1497 | 692423280028966913 | 2016-01-27 19:05:49+00:00 | Twitter for iPhone | PUPDATE: just noticed this dog has some extra ... | NaN | 9 | 10 | None | None | None | None | None |
| 1523 | 690607260360429569 | 2016-01-22 18:49:36+00:00 | Twitter for iPhone | 12/10 @LightningHoltt | NaN | 12 | 10 | None | None | None | None | None |
| 1605 | 685681090388975616 | 2016-01-09 04:34:45+00:00 | Twitter for iPhone | Jack deserves another round of applause. If yo... | NaN | 14 | 10 | None | None | None | None | None |
| 1618 | 684969860808454144 | 2016-01-07 05:28:35+00:00 | Twitter for iPhone | For those who claim this is a goat, u are wron... | NaN | 5 | 10 | None | None | None | None | None |
| 1689 | 681340665377193984 | 2015-12-28 05:07:27+00:00 | Twitter for iPhone | I've been told there's a slight possibility he... | NaN | 5 | 10 | None | None | None | None | None |
| 1774 | 678023323247357953 | 2015-12-19 01:25:31+00:00 | Twitter for iPhone | After getting lost in Reese's eyes for several... | NaN | 13 | 10 | None | None | None | None | None |
| 1819 | 676590572941893632 | 2015-12-15 02:32:17+00:00 | Twitter for iPhone | After some outrage from the crowd. Bubbles is ... | NaN | 7 | 10 | None | None | None | None | None |
| 1844 | 675849018447167488 | 2015-12-13 01:25:37+00:00 | Twitter for iPhone | This dog is being demoted to a 9/10 for not we... | NaN | 9 | 10 | None | None | None | None | None |
| 1895 | 674742531037511680 | 2015-12-10 00:08:50+00:00 | Twitter for iPhone | Some clarification is required. The dog is sin... | NaN | 11 | 10 | None | None | None | None | None |
| 1905 | 674606911342424069 | 2015-12-09 15:09:55+00:00 | Twitter for iPhone | The 13/10 also takes into account this impecca... | NaN | 13 | 10 | None | None | None | None | None |
| 1914 | 674330906434379776 | 2015-12-08 20:53:11+00:00 | Twitter for iPhone | 13/10\n@ABC7 | NaN | 13 | 10 | None | None | None | None | None |
| 1940 | 673716320723169284 | 2015-12-07 04:11:02+00:00 | Twitter for iPhone | The millennials have spoken and we've decided ... | NaN | 1 | 10 | None | None | None | None | None |
| 2038 | 671550332464455680 | 2015-12-01 04:44:10+00:00 | Twitter for iPhone | After 22 minutes of careful deliberation this ... | NaN | 1 | 10 | None | None | None | None | None |
| 2149 | 669684865554620416 | 2015-11-26 01:11:28+00:00 | Twitter for iPhone | After countless hours of research and hundreds... | NaN | 11 | 10 | None | None | None | None | None |
| 2189 | 668967877119254528 | 2015-11-24 01:42:25+00:00 | Twitter for iPhone | 12/10 good shit Bubka\n@wane15 | NaN | 12 | 10 | None | None | None | None | None |
| 2298 | 667070482143944705 | 2015-11-18 20:02:51+00:00 | Twitter for iPhone | After much debate this dog is being upgraded t... | NaN | 10 | 10 | None | None | None | None | None |
df_archive.expanded_urls.isna().sum()
54
duplicated_rows = df_archive[df_archive.duplicated('expanded_urls')]
duplicated_rows.head(10)
| tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 55 | 881633300179243008 | 2017-07-02 21:58:53+00:00 | Twitter for iPhone | @roushfenway These are good dogs but 17/10 is ... | NaN | 17 | 10 | None | None | None | None | None |
| 64 | 879674319642796034 | 2017-06-27 12:14:36+00:00 | Twitter for iPhone | @RealKentMurphy 14/10 confirmed | NaN | 14 | 10 | None | None | None | None | None |
| 113 | 870726314365509632 | 2017-06-02 19:38:25+00:00 | Twitter for iPhone | @ComplicitOwl @ShopWeRateDogs >10/10 is res... | NaN | 10 | 10 | None | None | None | None | None |
| 148 | 863427515083354112 | 2017-05-13 16:15:35+00:00 | Twitter for iPhone | @Jack_Septic_Eye I'd need a few more pics to p... | NaN | 12 | 10 | None | None | None | None | None |
| 179 | 857214891891077121 | 2017-04-26 12:48:51+00:00 | Twitter for iPhone | @Marc_IRL pixelated af 12/10 | NaN | 12 | 10 | None | None | None | None | None |
| 186 | 856288084350160898 | 2017-04-23 23:26:03+00:00 | Twitter for iPhone | @xianmcguire @Jenna_Marbles Kardashians wouldn... | NaN | 14 | 10 | None | None | None | None | None |
| 188 | 855862651834028034 | 2017-04-22 19:15:32+00:00 | Twitter for iPhone | @dhmontgomery We also gave snoop dogg a 420/10... | NaN | 420 | 10 | None | None | None | None | None |
| 189 | 855860136149123072 | 2017-04-22 19:05:32+00:00 | Twitter for iPhone | @s8n You tried very hard to portray this good ... | NaN | 666 | 10 | None | None | None | None | None |
| 218 | 850333567704068097 | 2017-04-07 13:04:55+00:00 | Twitter for iPhone | @markhoppus MARK THAT DOG HAS SEEN AND EXPERIE... | NaN | 13 | 10 | None | None | None | None | None |
| 228 | 848213670039564288 | 2017-04-01 16:41:12+00:00 | Twitter for iPhone | Jerry just apuppologized to me. He said there ... | NaN | 11 | 10 | None | None | None | None | None |
len(duplicated_rows)
54
df_archive = df_archive.dropna(subset = 'expanded_urls')
df_archive.drop_duplicates(subset = 'expanded_urls', inplace = True)
print('Number of NaN in expanded URLs colum is : {}\nNumber of rows in expanded URLs colum is : {}\nNumber of unique values in expanded URLs colum is : {}'
.format(df_archive.expanded_urls.isna().sum(),
df_archive.expanded_urls.nunique(),
len(df_archive.expanded_urls)))
Number of NaN in expanded URLs colum is : 0 Number of rows in expanded URLs colum is : 2098 Number of unique values in expanded URLs colum is : 2098
df_image.tweet_id = df_image.tweet_id.astype(str)
df_image.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2075 non-null object 1 jpg_url 2075 non-null object 2 img_num 2075 non-null int64 3 p1 2075 non-null object 4 p1_conf 2075 non-null float64 5 p1_dog 2075 non-null bool 6 p2 2075 non-null object 7 p2_conf 2075 non-null float64 8 p2_dog 2075 non-null bool 9 p3 2075 non-null object 10 p3_conf 2075 non-null float64 11 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(1), object(5) memory usage: 152.1+ KB
df_image.jpg_url.value_counts()
https://pbs.twimg.com/media/CZhn-QAWwAASQan.jpg 2
https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg 2
https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg 2
https://pbs.twimg.com/media/CU1zsMSUAAAS0qW.jpg 2
https://pbs.twimg.com/media/CsrjryzWgAAZY00.jpg 2
..
https://pbs.twimg.com/media/CXrmMSpUwAAdeRj.jpg 1
https://pbs.twimg.com/media/CXrawAhWkAAWSxC.jpg 1
https://pbs.twimg.com/media/CXrIntsUsAEkv0d.jpg 1
https://pbs.twimg.com/media/CXqcOHCUQAAugTB.jpg 1
https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1
Name: jpg_url, Length: 2009, dtype: int64
df_image.drop_duplicates(subset = 'jpg_url', inplace = True)
df_image.jpg_url.value_counts()
https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1
https://pbs.twimg.com/ext_tw_video_thumb/758467147756691456/pu/img/YTNzjRFDSPNXukmM.jpg 1
https://pbs.twimg.com/media/Coy87yiWYAACtPf.jpg 1
https://pbs.twimg.com/media/CovKqSYVIAAUbUW.jpg 1
https://pbs.twimg.com/media/CouEOZhWAAAgFpE.jpg 1
..
https://pbs.twimg.com/media/CXmd_bsWkAEEXck.jpg 1
https://pbs.twimg.com/media/CXltdtaWYAIuX_V.jpg 1
https://pbs.twimg.com/media/CXlN1-EWMAQdwXK.jpg 1
https://pbs.twimg.com/media/CXk4W0qWYAMEMEs.jpg 1
https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1
Name: jpg_url, Length: 2009, dtype: int64
len(df_image[(df_image['p1_dog'] == False) & (df_image['p2_dog'] == False) & (df_image['p3_dog'] == False)])
318
df_image = df_image[(df_image['p1_dog'] == True) | (df_image['p2_dog'] == True) | (df_image['p2_dog'] == True)]
len(df_image[(df_image['p1_dog'] == False) & (df_image['p2_dog'] == False) & (df_image['p3_dog'] == False)])
0
df_archive.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2098 entries, 0 to 2355 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2098 non-null object 1 timestamp 2098 non-null datetime64[ns, UTC] 2 source 2098 non-null object 3 text 2098 non-null object 4 expanded_urls 2098 non-null object 5 rating_numerator 2098 non-null int64 6 rating_denominator 2098 non-null int64 7 name 2098 non-null object 8 doggo 2098 non-null object 9 floofer 2098 non-null object 10 pupper 2098 non-null object 11 puppo 2098 non-null object dtypes: datetime64[ns, UTC](1), int64(2), object(9) memory usage: 213.1+ KB
I - change values in 'doggo', 'floofer', 'pupper', 'puppo' columns from none into nan and merge them into one column
df_archive = twitter_archive.copy()
df_archive.columns
Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
dtype='object')
names_columns = ['doggo', 'floofer', 'pupper', 'puppo']
names = df_archive[names_columns]
names = names.replace('None', np.NaN)
names['stage_name'] = names[names.columns[0:]].apply(
lambda x: ','.join(x.dropna().astype(str)),
axis=1
)
names.drop(names_columns, axis = 1, inplace = True)
df_archive.drop(names_columns, axis = 1, inplace = True)
df_archive.tweet_id = df_archive.tweet_id.astype(str)
df_archive = pd.concat([df_archive, names], axis = 1)
df_archive.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2237 entries, 0 to 2355 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2237 non-null object 1 in_reply_to_status_id 64 non-null float64 2 in_reply_to_user_id 64 non-null float64 3 timestamp 2237 non-null object 4 source 2237 non-null object 5 text 2237 non-null object 6 retweeted_status_id 178 non-null float64 7 retweeted_status_user_id 178 non-null float64 8 retweeted_status_timestamp 178 non-null object 9 expanded_urls 2190 non-null object 10 rating_numerator 2237 non-null int64 11 rating_denominator 2237 non-null int64 12 name 2237 non-null object 13 stage_name 2237 non-null object dtypes: float64(4), int64(2), object(8) memory usage: 262.1+ KB
df_archive['stage_name'] = df_archive['stage_name'].replace('', np.NaN)
df_archive.stage_name.isna().sum()
1861
df_archive.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2237 entries, 0 to 2355 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2237 non-null object 1 in_reply_to_status_id 64 non-null float64 2 in_reply_to_user_id 64 non-null float64 3 timestamp 2237 non-null object 4 source 2237 non-null object 5 text 2237 non-null object 6 retweeted_status_id 178 non-null float64 7 retweeted_status_user_id 178 non-null float64 8 retweeted_status_timestamp 178 non-null object 9 expanded_urls 2190 non-null object 10 rating_numerator 2237 non-null int64 11 rating_denominator 2237 non-null int64 12 name 2237 non-null object 13 stage_name 376 non-null object dtypes: float64(4), int64(2), object(8) memory usage: 262.1+ KB
df_json = df_json.rename(columns = {'id':'tweet_id'})
df_json.tweet_id = df_json.tweet_id.astype(str)
df_json.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1615 entries, 0 to 1614 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 1615 non-null object 1 retweet_count 1615 non-null int64 2 favorite_count 1615 non-null int64 dtypes: int64(2), object(1) memory usage: 38.0+ KB
df = df_archive.merge(df_image, how = 'outer', on = 'tweet_id').merge(df_json, how = 'outer', on = 'tweet_id')
df.head()
| tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | ... | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | retweet_count | favorite_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6973.0 | 33703.0 |
| 1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | ... | 0.323581 | True | Pekinese | 0.090647 | True | papillon | 0.068957 | True | 5276.0 | 29229.0 |
| 2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | ... | 0.716012 | True | malamute | 0.078253 | True | kelpie | 0.031379 | True | 3465.0 | 21976.0 |
| 3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | ... | 0.170278 | False | Labrador_retriever | 0.168086 | True | spatula | 0.040836 | False | 7193.0 | 36799.0 |
| 4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | ... | 0.555712 | True | English_springer | 0.225770 | True | German_short-haired_pointer | 0.175219 | True | 7719.0 | 35186.0 |
5 rows × 27 columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2340 entries, 0 to 2339 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tweet_id 2340 non-null object 1 in_reply_to_status_id 64 non-null float64 2 in_reply_to_user_id 64 non-null float64 3 timestamp 2237 non-null object 4 source 2237 non-null object 5 text 2237 non-null object 6 retweeted_status_id 178 non-null float64 7 retweeted_status_user_id 178 non-null float64 8 retweeted_status_timestamp 178 non-null object 9 expanded_urls 2190 non-null object 10 rating_numerator 2237 non-null float64 11 rating_denominator 2237 non-null float64 12 name 2237 non-null object 13 stage_name 376 non-null object 14 jpg_url 1629 non-null object 15 img_num 1629 non-null float64 16 p1 1629 non-null object 17 p1_conf 1629 non-null float64 18 p1_dog 1629 non-null object 19 p2 1629 non-null object 20 p2_conf 1629 non-null float64 21 p2_dog 1629 non-null object 22 p3 1629 non-null object 23 p3_conf 1629 non-null float64 24 p3_dog 1629 non-null object 25 retweet_count 1615 non-null float64 26 favorite_count 1615 non-null float64 dtypes: float64(12), object(15) memory usage: 511.9+ KB
df.to_csv('twitter_archive_master.csv', index = False)
df.img_num.value_counts()
1.0 1385 2.0 165 3.0 53 4.0 26 Name: img_num, dtype: int64
df.describe()
| in_reply_to_status_id | in_reply_to_user_id | retweeted_status_id | retweeted_status_user_id | rating_numerator | rating_denominator | img_num | p1_conf | p2_conf | p3_conf | retweet_count | favorite_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6.400000e+01 | 6.400000e+01 | 1.780000e+02 | 1.780000e+02 | 2237.000000 | 2237.000000 | 1629.000000 | 1629.000000 | 1629.000000 | 1.629000e+03 | 1615.000000 | 1615.000000 |
| mean | 7.449206e+17 | 2.454770e+16 | 7.731397e+17 | 1.262626e+16 | 10.985248 | 10.003576 | 1.214242 | 0.606673 | 0.138292 | 6.175369e-02 | 2767.946749 | 8057.959133 |
| std | 7.465588e+16 | 1.381047e+17 | 6.187355e+16 | 9.678889e+16 | 1.661211 | 0.111846 | 0.573916 | 0.265185 | 0.101527 | 5.215186e-02 | 4253.716518 | 11845.717164 |
| min | 6.658147e+17 | 1.185634e+07 | 6.671383e+17 | 7.832140e+05 | 6.000000 | 10.000000 | 1.000000 | 0.044333 | 0.000010 | 2.160900e-07 | 1.000000 | 0.000000 |
| 25% | 6.754602e+17 | 4.400207e+08 | 7.273831e+17 | 4.196984e+09 | 10.000000 | 10.000000 | 1.000000 | 0.379797 | 0.055527 | 1.596500e-02 | 406.000000 | 730.000000 |
| 50% | 7.062329e+17 | 4.196984e+09 | 7.805335e+17 | 4.196984e+09 | 11.000000 | 10.000000 | 1.000000 | 0.607401 | 0.122019 | 5.059200e-02 | 1484.000000 | 2960.000000 |
| 75% | 8.216960e+17 | 4.196984e+09 | 8.206409e+17 | 4.196984e+09 | 12.000000 | 10.000000 | 1.000000 | 0.852088 | 0.199619 | 9.521820e-02 | 3397.500000 | 11175.000000 |
| max | 8.862664e+17 | 8.405479e+17 | 8.874740e+17 | 7.874618e+17 | 15.000000 | 15.000000 | 4.000000 | 0.999984 | 0.467678 | 2.734190e-01 | 51446.000000 | 123705.000000 |
labels = ['iPhone', 'Vine', 'Twitter Web Client', 'TweetDeck']
plt.figure(figsize = (10,10))
plt.pie(df['source'].value_counts(), labels = labels, autopct = '%1.2f', textprops={'fontsize': 18})
plt.yticks(fontsize=20)
plt.title('Most popular source', fontsize = 18);
plt.figure(figsize = (15,25))
breed = df.groupby('p1').filter(lambda x: len(x) > 10)
breed.p1.value_counts(normalize = True).plot(kind = 'barh');
plt.title('Most popular dog breed')
plt.xlabel('Count')
plt.ylabel('Dog Breed');
plt.figure(figsize = (10,10))
sns.regplot(data = df, x = 'retweet_count', y = 'favorite_count',
scatter_kws={"color": "blue"}, line_kws={"color": "red"})
plt.title('Corelation between retweet count and favorite count')
plt.xlabel('Retweets Count')
plt.ylabel('Favorite Count');